setwd("/home/armand/Desktop/phylophile_writeup/benchmarks/lanl_POL_CDS/selenium")

infile <- read.csv("timeFile.csv", header = FALSE, stringsAsFactors = FALSE)

library(reshape2)
library(ggplot2)
library(gridExtra)


colnames(infile) <- c("N_in", "process", "time")
#connvert N_in to actual values
infile$N_in <- as.integer(substr(infile$N_in, 19, length(infile$N_in)-14))

##blast hits vs queries
#create a new df with the N_in and N_blst_hits 
blastHits <- infile[infile$process=="blastHits",][,-2]
colnames(blastHits) <- c("N_in", "N_blst")


#creat on theme for ggplot to use thoughout
phyloPiTheme <- theme_bw(base_size = 14) +
  theme(axis.line = element_line(colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank())


#plot and fit N_blst against N_in, force through 0
fit <-  lm(blastHits$N_blst ~ blastHits$N_in-1)
coefficients(fit)
## blastHits$N_in 
##       4.628026
summary(fit)
## 
## Call:
## lm(formula = blastHits$N_blst ~ blastHits$N_in - 1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.401  -0.121   2.497   4.735   8.183 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## blastHits$N_in  4.62803    0.02803   165.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.808 on 49 degrees of freedom
## Multiple R-squared:  0.9982, Adjusted R-squared:  0.9982 
## F-statistic: 2.726e+04 on 1 and 49 DF,  p-value: < 2.2e-16
nBlstPlot <- ggplot(blastHits, aes(x = N_in, y = N_blst)) +
  geom_point(shape = 1) + 
  geom_smooth(method=lm, se = FALSE, colour='black',formula=y~x-1, size = 0.25) + 
  labs(x="Number of input sequences", y="Number of sequences retrieved by BLAST") +
  annotate("text", x=41, y=72, label = "y == 4.628 * x", parse=T)+
  annotate("text", x=40, y=60, label = "R^2 == 0.998", parse=T) +
  phyloPiTheme
nBlstPlot

ggsave(file="blast hits vs queries.svg", plot=nBlstPlot)
## Saving 7 x 5 in image
#parse the dataframe for blast
blast <- infile[infile$process=="blast",][-2]
blast <- cbind(blastHits, blast)[-3]

fitBlastT <-  lm(blast$time ~ blast$N_in-1)
coefficients(fitBlastT)
## blast$N_in 
##   11.02104
summary(fitBlastT)
## 
## Call:
## lm(formula = blast$time ~ blast$N_in - 1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7791 -1.3727 -0.4803  0.5859  4.2044 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## blast$N_in 11.021038   0.008848    1246   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.833 on 49 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.552e+06 on 1 and 49 DF,  p-value: < 2.2e-16
blastT <- ggplot(blast, aes(x = N_in, y = time)) + 
  geom_point(shape = 1) +
  geom_smooth(method=lm, se = FALSE, colour='black',formula=y~x-1, size = 0.25) + 
  labs(x="Number of input sequences", y="time (s)") +
  annotate("text", x=40, y=200, label = "y == 11.02 * x", parse=T)+
  annotate("text", x=40, y=170, label = "R^2 == 1", parse=T) +
  phyloPiTheme
blastT

ggsave(file="blast time.svg", plot = blastT)
## Saving 7 x 5 in image
#parse the dataframe for mafft
mafft <- infile[infile$process=="mafftTime",][-2]
mafft <- cbind(blastHits, mafft)[-3]

#plot and fit time against N-blst, mafft
t <- mafft$time
N <- mafft$N_blst
fit <- nls(t~b * N**a, start = list(a=2,b=0.1))
cor(t,predict(fit))
## [1] 0.9993305
summary(fit)
## 
## Formula: t ~ b * N^a
## 
## Parameters:
##   Estimate Std. Error t value Pr(>|t|)    
## a  1.91841    0.01723   111.3  < 2e-16 ***
## b  0.15319    0.01380    11.1  7.5e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 58.09 on 48 degrees of freedom
## 
## Number of iterations to convergence: 7 
## Achieved convergence tolerance: 2.583e-08
mafftT <- ggplot(mafft, aes(x=N_blst, y=time))+
  geom_point(shape = 1) +
  geom_smooth(method="nls",
              formula = y ~ b * x**a, 
              method.args=list(start = c(a = 2, b = 0.1)), 
              se = FALSE, colour='black', size = 0.25) +
  labs(x="Number of sequences in alignment", y="time (s)") +
  annotate("text", x=190, y=1800, label = "y == 0.153 * x^1.92", parse=T)+
  phyloPiTheme 
  
mafftT

ggsave(file="mafftTime.svg", plot=mafftT)
## Saving 7 x 5 in image
y <- mafft$time
x <- mafft$N_blst
fitMafft <- nls(y ~ b * x**a, start = list(a = 2, b = 0.1))
plot(x=mafft$N_blst, y=mafft$time)
lines(x, predict(fitMafft))

plot(x, resid(fitMafft))
abline(h=0)

qqnorm(resid(fitMafft))
qqline(resid(fitMafft))

#plot for fastTree
#for number of queries in input
fastTreeNq <- infile[infile$process == "fasttreeTime",][-2]
#add blastHits here
fastTreeWithBlstHits <- cbind(fastTreeNq,blastHits)[-3]



##an of time vs input query
fitFastTreeNq <-  lm(fastTreeNq$time ~ blast$N_in-1)
coefficients(fitFastTreeNq)
## blast$N_in 
##   3.021342
summary(fitFastTreeNq)
## 
## Call:
## lm(formula = fastTreeNq$time ~ blast$N_in - 1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.308  -6.781  -2.839   2.161  19.742 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)    
## blast$N_in  3.02134    0.03424   88.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.094 on 49 degrees of freedom
## Multiple R-squared:  0.9937, Adjusted R-squared:  0.9936 
## F-statistic:  7786 on 1 and 49 DF,  p-value: < 2.2e-16
fastTreeNqP <- ggplot(fastTreeNq, aes(x = N_in, y = time)) + 
  geom_point(shape = 1) +
  geom_smooth(method=lm, se = FALSE, colour='black',formula=y~x-1, size = 0.25) + 
  labs(x="Number of query sequences", y="time (s)") +
  annotate("text", x=40, y=70, label = "y == 3.02 * x", parse=T)+
  annotate("text", x=40, y=60, label = "R^2 == .994", parse=T)+
  phyloPiTheme
fastTreeNqP

ggsave(file="fastTree query sequences.svg", plot = fastTreeNqP)
## Saving 7 x 5 in image
##an of time vs blast results, thus number of samples in analysis
fitFastTreeWithBlstHits <-  lm(fastTreeWithBlstHits$time ~ fastTreeWithBlstHits$N_blst -1)
coefficients(fitFastTreeWithBlstHits)
## fastTreeWithBlstHits$N_blst 
##                   0.6520651
summary(fitFastTreeWithBlstHits)
## 
## Call:
## lm(formula = fastTreeWithBlstHits$time ~ fastTreeWithBlstHits$N_blst - 
##     1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.847  -6.985  -3.594   0.281  17.291 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## fastTreeWithBlstHits$N_blst 0.652065   0.007718   84.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.407 on 49 degrees of freedom
## Multiple R-squared:  0.9932, Adjusted R-squared:  0.993 
## F-statistic:  7139 on 1 and 49 DF,  p-value: < 2.2e-16
fastTreeWithBlastHitsP <- ggplot(fastTreeWithBlstHits, aes(x = N_blst, y = time)) + 
  geom_point(shape = 1) +
  geom_smooth(method=lm, se = FALSE, colour='black',formula=y~x-1, size = 0.25) + 
  labs(x="Number of sequences in alingment", y="time (s)") +
  annotate("text", x=170, y=60, label = "y == 0.652 * x", parse=T)+
  annotate("text", x=170, y=50, label = "R^2 == .993", parse=T)+
  phyloPiTheme
fastTreeWithBlastHitsP

ggsave(file="fastTree total sequences.svg", plot = fastTreeWithBlastHitsP)
## Saving 7 x 5 in image
figure <- grid.arrange(nBlstPlot, blastT, mafftT, fastTreeWithBlastHitsP, nrow = 2, ncol = 2)

figure
## TableGrob (2 x 2) "arrange": 4 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
ggsave(file = "benchFig.svg",plot = figure, width = 7.66, height = 10)